library(tidyverse)
library(caret)
library(e1071)
library(factoextra)
library(plotly)
library(ggcorrplot)
library(fields)
brain_data <- read_csv("../data/IBIS_brain_data_ex.csv")
brain_data <- brain_data %>%
  select(names(brain_data)[grepl("V24|RiskGroup|CandID", names(brain_data))]) %>%
  select(CandID:Uncinate_R_V24, RiskGroup:V24_VDQ) %>%
  drop_na()

brain_data_x <- brain_data %>% select(EACSF_V24:Uncinate_R_V24)

Dimension Reduction

Principal Components Analysis

pca_brain <- princomp(x=brain_data_x, cor=TRUE,scores=TRUE)

# Look at screen plot to view amount of variance explained by PC
fviz_eig(pca_brain)

# Just look at first 2 PCs
load_subset <- data.frame(pca_brain$loadings[,1:2]) %>% 
  rownames_to_column(var = "variable") %>%
  pivot_longer(cols=c("Comp.1", "Comp.2"), names_to="comp_number", values_to="loading")

# Plot loadings
ggplot(data=load_subset, mapping=aes(x=variable, y=loading))+
  geom_bar(stat="identity")+
  facet_grid(comp_number~.)+ 
  theme_bw()+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

# Hard to interpret, need to reduce size
load_subset_normed <- data.frame(pca_brain$loadings[,1:2]) %>% 
  rownames_to_column(var = "variable") %>%
  mutate(Comp.1 = scale(Comp.1),
         Comp.2 = scale(Comp.2)) %>%
  pivot_longer(cols=c("Comp.1", "Comp.2"), names_to="comp_number", values_to="loading")

ggplot(data=load_subset_normed, mapping=aes(x=loading))+
  geom_histogram()+
  facet_grid(comp_number~.)+ 
  theme_bw()

load_subset_normed <- load_subset_normed %>%
  filter(loading>0.5|loading< -0.5) %>%
  mutate(large_loading = factor(ifelse(loading>1.96|loading< -1.96, 1, 0)))

ggplot(data=load_subset_normed, mapping=aes(x=variable, y=loading, fill=large_loading))+
  geom_bar(stat="identity")+
  facet_grid(comp_number~.)+ 
  theme_bw()+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

# Can look at PCA scores for each person and plot as well
fviz_pca_ind(pca_brain,
             col.ind = "cos2", # Color by the quality of representation
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     # Avoid text overlapping
             )

# Do these relate to diagnosis?
fviz_pca_ind(pca_brain,
             col.ind = brain_data$RiskGroup, # color by groups
             palette = c("#0072B2", "#D55E00", "#CC79A7"),
             addEllipses = TRUE, # Concentration ellipses
             ellipse.type = "confidence",
             legend.title = "Groups",
             label = "none",
             repel = TRUE
             )

# Can look at high variables contribute to components, labels make this messy
fviz_pca_var(pca_brain,
             col.var = "contrib", # Color by contributions to the PC
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     # Avoid text overlapping
             )

fviz_pca_var(pca_brain,
             col.var = "contrib", # Color by contributions to the PC
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             select.var = list("cos2"=10),
             repel = TRUE     # Avoid text overlapping
             )

# Can extract scores, use for further analysis
pc_scores <- pca_brain$scores[,1:2]
brain_data_w_pcs <- data.frame(brain_data, pc_scores)

Visualizing results

# Create train and testing sets
brain_data_w_pcs <- brain_data_w_pcs %>% 
  filter(grepl("HR", RiskGroup)) %>%
  mutate(RiskGroup=fct_relevel(factor(RiskGroup), "HR-Neg"))
tt_indices <- createDataPartition(y=brain_data_w_pcs$RiskGroup, p=0.6, list=FALSE)

# Train using PCs with logistic regression
train_data <- brain_data_w_pcs[tt_indices,]
test_data <- brain_data_w_pcs[-tt_indices,]

model_fit <- glm(RiskGroup~`Comp.1`+`Comp.1`, data=train_data, family=binomial)
test_data$model_prob <- predict(model_fit, type="response", newdata = test_data)
test_data$model_pred <- 
  fct_relevel(factor(ifelse(test_data$model_prob>0.25, "HR-ASD", "HR-Neg")), "HR-Neg")

# Plot PCs by probability, make plot interactive
ggplotly(ggplot(test_data, mapping=aes(x=`Comp.1`, y=`Comp.2`, 
                                       color=model_prob, label=RiskGroup))+
  geom_point()+
  scale_color_gradient2(low="black", mid = "red", high = "yellow", 
                        midpoint = median(test_data$model_prob))+
  theme_bw())
# Histo of PCs by diagnosis
ggplotly(ggplot(test_data, mapping=aes(x=`Comp.1`, fill=RiskGroup))+
  geom_histogram())
# Plot confusion matrix as heatmap
test_confusion_matrix <- confusionMatrix(reference=test_data$RiskGroup,
                                         data=test_data$model_pred)$table

# Convert to dataframe
confusion_matrix_df <- data.frame("obs"=c("HR-Neg", "HR-Neg", "HR-ASD", "HR-ASD"),
                                  "pred"=c("HR-Neg", "HR-ASD", "HR-Neg", "HR-ASD"),
                                  "count"=as.vector(confusionMatrix(reference=test_data$RiskGroup,
                 data=test_data$model_pred)$table))

ggplot(data=confusion_matrix_df, mapping=aes(x=obs, y=pred, fill=count))+
  geom_raster()+
  geom_text(mapping=aes(label=count), color="white", size=10)+
  scale_fill_gradient(low="black", high="red")+
  #coord_flip()+
  theme_minimal()

# Heatmap with PCs